{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/pufanyi/anaconda3/anacondabin/envs/live_bench/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Downloading readme: 100%|██████████| 2.45k/2.45k [00:00<00:00, 5.08MB/s]\n",
      "Downloading data: 100%|██████████| 88.6M/88.6M [00:04<00:00, 19.6MB/s]\n",
      "Generating test split: 100%|██████████| 320/320 [00:00<00:00, 384.23 examples/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Dataset repository id and the configuration (monthly snapshot) to load.\n",
    "DATASET_REPO = \"lmms-lab/LiveBench\"\n",
    "DATASET_CONFIG = \"2024-07\"\n",
    "\n",
    "# Fetch the dataset; the recorded run downloaded ~88.6 MB and generated a\n",
    "# single 320-example `test` split.\n",
    "dataset = load_dataset(DATASET_REPO, DATASET_CONFIG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    test: Dataset({\n",
       "        features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n",
       "        num_rows: 320\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the available splits with their feature names and row counts.\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>images</th>\n",
       "      <th>website</th>\n",
       "      <th>question</th>\n",
       "      <th>answer</th>\n",
       "      <th>criteria</th>\n",
       "      <th>subtask</th>\n",
       "      <th>data_generator</th>\n",
       "      <th>checker</th>\n",
       "      <th>date_time</th>\n",
       "      <th>screen_shoter</th>\n",
       "      <th>screen_size</th>\n",
       "      <th>score</th>\n",
       "      <th>reason</th>\n",
       "      <th>scorer_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'url': 'https://www.bbc.com/news'}</td>\n",
       "      <td>Examine the top menu bar of the BBC website di...</td>\n",
       "      <td>C) Weather</td>\n",
       "      <td>Award 10 points for selecting option C) Weathe...</td>\n",
       "      <td>Basic Understanding</td>\n",
       "      <td>claude</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-20 14:02:22</td>\n",
       "      <td>single_screen</td>\n",
       "      <td>(1024, 1024)</td>\n",
       "      <td>10</td>\n",
       "      <td>The answer is correct and can be directly veri...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'url': 'https://www.bbc.com/news'}</td>\n",
       "      <td>Based on the image accompanying the article ab...</td>\n",
       "      <td>The image depicts a crowded airport setting, l...</td>\n",
       "      <td>{'10 points': 'The answer correctly identifies...</td>\n",
       "      <td>Deeper Implications</td>\n",
       "      <td>claude</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-20 14:02:22</td>\n",
       "      <td>single_screen</td>\n",
       "      <td>(1024, 1024)</td>\n",
       "      <td>10</td>\n",
       "      <td>The image clearly shows that the news article ...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'url': 'https://www.bbc.com/news'}</td>\n",
       "      <td>Based on the photograph accompanying the headl...</td>\n",
       "      <td>The photograph shows a crowded scene inside an...</td>\n",
       "      <td>{'2 points': 'Interprets the overall scene as ...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>claude</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-20 14:02:22</td>\n",
       "      <td>single_screen</td>\n",
       "      <td>(1024, 1024)</td>\n",
       "      <td>10</td>\n",
       "      <td>The answer accurately describes the image and ...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'url': 'https://www.bbc.com/news'}</td>\n",
       "      <td>Considering the headline 'Global services slow...</td>\n",
       "      <td>This incident underscores several critical iss...</td>\n",
       "      <td>Award up to 10 marks: 1.5 points for discussin...</td>\n",
       "      <td>Deeper Implications</td>\n",
       "      <td>claude</td>\n",
       "      <td>None</td>\n",
       "      <td>2024-07-20 14:02:22</td>\n",
       "      <td>single_screen</td>\n",
       "      <td>(1024, 1024)</td>\n",
       "      <td>10</td>\n",
       "      <td>The answer provides an insightful analysis of ...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'url': 'https://www.bbc.com/news'}</td>\n",
       "      <td>Analyze the layout and image selection of the ...</td>\n",
       "      <td>The layout of the BBC News homepage utilizes v...</td>\n",
       "      <td>{'10 points': 'The answer provides a detailed ...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>claude</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-20 14:02:22</td>\n",
       "      <td>single_screen</td>\n",
       "      <td>(1024, 1024)</td>\n",
       "      <td>8</td>\n",
       "      <td>The ranking provided is generally accurate and...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>315</th>\n",
       "      <td>315</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'name': 'bloomberg.comeconomics.png'}</td>\n",
       "      <td>Analyze the image accompanying the article 'Xi...</td>\n",
       "      <td>The image accompanying the article suggests a ...</td>\n",
       "      <td>Scoring Criteria (Total: 10 points)\\n\\n- Ident...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>gpt4v</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-21 20:23:39</td>\n",
       "      <td>human</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>While the question is thoughtful and relevant ...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>316</th>\n",
       "      <td>316</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'name': 'wsj.comworldafrica.png'}</td>\n",
       "      <td>Analyze the image accompanying the article 'Qu...</td>\n",
       "      <td>The image contains several key elements that s...</td>\n",
       "      <td>Scoring Criteria (Total: 10 points):\\n\\n1. Ide...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>gpt4v</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-21 20:27:57</td>\n",
       "      <td>human</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>While the question is based on information pro...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>317</th>\n",
       "      <td>317</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'name': 'wsj.comworldafrica.png'}</td>\n",
       "      <td>Based on the articles shown on the Wall Street...</td>\n",
       "      <td>The two most closely related news stories are:...</td>\n",
       "      <td>Scoring Criteria (Total 10 points):\\n1. Correc...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>gpt4v</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-21 20:27:57</td>\n",
       "      <td>human</td>\n",
       "      <td>None</td>\n",
       "      <td>7</td>\n",
       "      <td>This question requires analysis and interpreta...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>318</th>\n",
       "      <td>318</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'name': 'wsj.comworldafrica.png'}</td>\n",
       "      <td>Analyze the image and headlines related to the...</td>\n",
       "      <td>The image and headlines present a complex narr...</td>\n",
       "      <td>Scoring Criteria (Total: 10 points)\\n\\n1. Anal...</td>\n",
       "      <td>Contextual Analysis</td>\n",
       "      <td>gpt4v</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-21 20:27:57</td>\n",
       "      <td>human</td>\n",
       "      <td>None</td>\n",
       "      <td>6</td>\n",
       "      <td>This question and answer are not directly supp...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>319</th>\n",
       "      <td>319</td>\n",
       "      <td>[{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...</td>\n",
       "      <td>{'name': 'wsj.comworldafrica.png'}</td>\n",
       "      <td>Analyze the image accompanying the article tit...</td>\n",
       "      <td>The image contains several key visual elements...</td>\n",
       "      <td>Scoring Criteria (Total 10 points):\\n\\n- Ident...</td>\n",
       "      <td>Basic Understanding</td>\n",
       "      <td>gpt4v</td>\n",
       "      <td>gemini</td>\n",
       "      <td>2024-07-21 20:27:57</td>\n",
       "      <td>human</td>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "      <td>This answer is not directly supported by the i...</td>\n",
       "      <td>claude</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>320 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id                                             images  \\\n",
       "0      0  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "1      1  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "2      2  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "3      3  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "4      4  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "..   ...                                                ...   \n",
       "315  315  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "316  316  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "317  317  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "318  318  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "319  319  [{'bytes': b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIH...   \n",
       "\n",
       "                                    website  \\\n",
       "0       {'url': 'https://www.bbc.com/news'}   \n",
       "1       {'url': 'https://www.bbc.com/news'}   \n",
       "2       {'url': 'https://www.bbc.com/news'}   \n",
       "3       {'url': 'https://www.bbc.com/news'}   \n",
       "4       {'url': 'https://www.bbc.com/news'}   \n",
       "..                                      ...   \n",
       "315  {'name': 'bloomberg.comeconomics.png'}   \n",
       "316      {'name': 'wsj.comworldafrica.png'}   \n",
       "317      {'name': 'wsj.comworldafrica.png'}   \n",
       "318      {'name': 'wsj.comworldafrica.png'}   \n",
       "319      {'name': 'wsj.comworldafrica.png'}   \n",
       "\n",
       "                                              question  \\\n",
       "0    Examine the top menu bar of the BBC website di...   \n",
       "1    Based on the image accompanying the article ab...   \n",
       "2    Based on the photograph accompanying the headl...   \n",
       "3    Considering the headline 'Global services slow...   \n",
       "4    Analyze the layout and image selection of the ...   \n",
       "..                                                 ...   \n",
       "315  Analyze the image accompanying the article 'Xi...   \n",
       "316  Analyze the image accompanying the article 'Qu...   \n",
       "317  Based on the articles shown on the Wall Street...   \n",
       "318  Analyze the image and headlines related to the...   \n",
       "319  Analyze the image accompanying the article tit...   \n",
       "\n",
       "                                                answer  \\\n",
       "0                                           C) Weather   \n",
       "1    The image depicts a crowded airport setting, l...   \n",
       "2    The photograph shows a crowded scene inside an...   \n",
       "3    This incident underscores several critical iss...   \n",
       "4    The layout of the BBC News homepage utilizes v...   \n",
       "..                                                 ...   \n",
       "315  The image accompanying the article suggests a ...   \n",
       "316  The image contains several key elements that s...   \n",
       "317  The two most closely related news stories are:...   \n",
       "318  The image and headlines present a complex narr...   \n",
       "319  The image contains several key visual elements...   \n",
       "\n",
       "                                              criteria              subtask  \\\n",
       "0    Award 10 points for selecting option C) Weathe...  Basic Understanding   \n",
       "1    {'10 points': 'The answer correctly identifies...  Deeper Implications   \n",
       "2    {'2 points': 'Interprets the overall scene as ...  Contextual Analysis   \n",
       "3    Award up to 10 marks: 1.5 points for discussin...  Deeper Implications   \n",
       "4    {'10 points': 'The answer provides a detailed ...  Contextual Analysis   \n",
       "..                                                 ...                  ...   \n",
       "315  Scoring Criteria (Total: 10 points)\\n\\n- Ident...  Contextual Analysis   \n",
       "316  Scoring Criteria (Total: 10 points):\\n\\n1. Ide...  Contextual Analysis   \n",
       "317  Scoring Criteria (Total 10 points):\\n1. Correc...  Contextual Analysis   \n",
       "318  Scoring Criteria (Total: 10 points)\\n\\n1. Anal...  Contextual Analysis   \n",
       "319  Scoring Criteria (Total 10 points):\\n\\n- Ident...  Basic Understanding   \n",
       "\n",
       "    data_generator checker            date_time  screen_shoter   screen_size  \\\n",
       "0           claude  gemini  2024-07-20 14:02:22  single_screen  (1024, 1024)   \n",
       "1           claude  gemini  2024-07-20 14:02:22  single_screen  (1024, 1024)   \n",
       "2           claude  gemini  2024-07-20 14:02:22  single_screen  (1024, 1024)   \n",
       "3           claude    None  2024-07-20 14:02:22  single_screen  (1024, 1024)   \n",
       "4           claude  gemini  2024-07-20 14:02:22  single_screen  (1024, 1024)   \n",
       "..             ...     ...                  ...            ...           ...   \n",
       "315          gpt4v  gemini  2024-07-21 20:23:39          human          None   \n",
       "316          gpt4v  gemini  2024-07-21 20:27:57          human          None   \n",
       "317          gpt4v  gemini  2024-07-21 20:27:57          human          None   \n",
       "318          gpt4v  gemini  2024-07-21 20:27:57          human          None   \n",
       "319          gpt4v  gemini  2024-07-21 20:27:57          human          None   \n",
       "\n",
       "     score                                             reason scorer_name  \n",
       "0       10  The answer is correct and can be directly veri...      claude  \n",
       "1       10  The image clearly shows that the news article ...      claude  \n",
       "2       10  The answer accurately describes the image and ...      claude  \n",
       "3       10  The answer provides an insightful analysis of ...      claude  \n",
       "4        8  The ranking provided is generally accurate and...      claude  \n",
       "..     ...                                                ...         ...  \n",
       "315      6  While the question is thoughtful and relevant ...      claude  \n",
       "316      6  While the question is based on information pro...      claude  \n",
       "317      7  This question requires analysis and interpreta...      claude  \n",
       "318      6  This question and answer are not directly supp...      claude  \n",
       "319      5  This answer is not directly supported by the i...      claude  \n",
       "\n",
       "[320 rows x 15 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert the test split to a pandas DataFrame for tabular inspection;\n",
    "# the bare name on the last line renders it with the rich HTML repr.\n",
    "test_df = dataset[\"test\"].to_pandas()\n",
    "test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "live_bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
